{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 程序功能：基于数据均值和方差对数据标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import csv\n",
    "from time import *\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取数据集文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据集大小： (4898430, 42)\n",
      "读取文件耗时： 174.93708682060242 s\n"
     ]
    }
   ],
   "source": [
    "begin_time = time()                     # 读取文件开始时间\n",
    "data_numerization = open(\"kddcup.data.numerization_corrected.txt\") \n",
    "lines = data_numerization.readlines()\n",
    "line_nums = len(lines)\n",
    "x_data = np.zeros((line_nums, 42))      # 创建line_nums行 para_num列的矩阵\n",
    "for i in range(line_nums):\n",
    "    line = lines[i].strip().split(',')\n",
    "    x_data[i, :] = line[0:42]           # 获取42个特征\n",
    "data_numerization.close()\n",
    "print('数据集大小：',x_data.shape)\n",
    "\n",
    "# 耗时分析\n",
    "end_time = time()                      # 读取文件结束时间\n",
    "total_time = end_time-begin_time       # 读取文件耗时\n",
    "print('读取文件耗时：',total_time,'s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 指定GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "1\n",
      "0\n",
      "GeForce RTX 2060\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "print(torch.cuda.is_available())        # 查看GPU是否可用\n",
    "print(torch.cuda.device_count())        # GPU数量， 1\n",
    "print(torch.cuda.current_device())      # 当前GPU的索引\n",
    "print(torch.cuda.get_device_name(0))    # 输出GPU名称\n",
    "\n",
    "device = torch.device('cuda:0')         # 指定device为0号GPU，若使用CPU则填写\"cpu\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 函数功能：数据标准化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方案一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在上CPU，循环计算\n",
    "def Zscore_Normalization(x, n):\n",
    "    if np.std(x) == 0:\n",
    "        x_data[:, n] = 0\n",
    "    else:\n",
    "        i = 0\n",
    "        while i<len(x):\n",
    "            x_data[i][n] = (x[i] - np.mean(x)) / np.std(x)\n",
    "            i = i + 1\n",
    "    print(\"The \", n , \"feature  is normalizing.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方案二"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在GPU上并行加速，循环计算\n",
    "def Zscore_Normalization(x, n):\n",
    "    if np.std(x) == 0:\n",
    "        x_data[:, n] = 0\n",
    "    else:\n",
    "        mean = torch.tensor(np.mean(x), device='cuda:0')\n",
    "        std = torch.tensor(np.std(x), device='cuda:0')\n",
    "        x = torch.tensor(x, device='cuda:0')\n",
    "        i = 0\n",
    "        while i<len(x):\n",
    "            x_data[i][n] = ((x[i] - mean) / std).cpu().numpy()\n",
    "            i = i + 1\n",
    "    print(\"The \", n , \"feature  is normalizing.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方案三"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在GPU上并行加速，并利用pytorch的Tensor的广播机制做矩阵计算\n",
    "def Zscore_Normalization(x, n):\n",
    "    if np.std(x) == 0:\n",
    "        x_data[:, n] = 0\n",
    "    else:\n",
    "        mean = torch.tensor(np.mean(x), device='cuda:0')\n",
    "        std = torch.tensor(np.std(x), device='cuda:0')\n",
    "        x = torch.tensor(x, device='cuda:0').view(-1,1)\n",
    "        x_data[:, n] = ((x - mean) / std).cpu().numpy().T\n",
    "    print(\"The \", n , \"feature  is normalizing.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方案四"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在上CPU，并利用numpy的ndarray数组的广播机制做矩阵计算\n",
    "def Zscore_Normalization(x, n):\n",
    "    if np.std(x) == 0:\n",
    "        x_data[:, n] = 0\n",
    "    else:\n",
    "        x_data[:, n] = (x - np.mean(x)) / np.std(x)\n",
    "    print(\"The \", n , \"feature  is normalizing.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 获取某列特征，并依次标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The  0 feature  is normalizing.\n",
      "The  1 feature  is normalizing.\n",
      "The  2 feature  is normalizing.\n",
      "The  3 feature  is normalizing.\n",
      "The  4 feature  is normalizing.\n",
      "The  5 feature  is normalizing.\n",
      "The  6 feature  is normalizing.\n",
      "The  7 feature  is normalizing.\n",
      "The  8 feature  is normalizing.\n",
      "The  9 feature  is normalizing.\n",
      "The  10 feature  is normalizing.\n",
      "The  11 feature  is normalizing.\n",
      "The  12 feature  is normalizing.\n",
      "The  13 feature  is normalizing.\n",
      "The  14 feature  is normalizing.\n",
      "The  15 feature  is normalizing.\n",
      "The  16 feature  is normalizing.\n",
      "The  17 feature  is normalizing.\n",
      "The  18 feature  is normalizing.\n",
      "The  19 feature  is normalizing.\n",
      "The  20 feature  is normalizing.\n",
      "The  21 feature  is normalizing.\n",
      "The  22 feature  is normalizing.\n",
      "The  23 feature  is normalizing.\n",
      "The  24 feature  is normalizing.\n",
      "The  25 feature  is normalizing.\n",
      "The  26 feature  is normalizing.\n",
      "The  27 feature  is normalizing.\n",
      "The  28 feature  is normalizing.\n",
      "The  29 feature  is normalizing.\n",
      "The  30 feature  is normalizing.\n",
      "The  31 feature  is normalizing.\n",
      "The  32 feature  is normalizing.\n",
      "The  33 feature  is normalizing.\n",
      "The  34 feature  is normalizing.\n",
      "The  35 feature  is normalizing.\n",
      "The  36 feature  is normalizing.\n",
      "The  37 feature  is normalizing.\n",
      "The  38 feature  is normalizing.\n",
      "The  39 feature  is normalizing.\n",
      "The  40 feature  is normalizing.\n",
      "The  41 feature  is normalizing.\n",
      "标准化耗时： 17.756114959716797 s\n"
     ]
    }
   ],
   "source": [
    "begin_time = time()                     # 标准化开始时间\n",
    "for i in range(42):\n",
    "    Zscore_Normalization(x_data[:, i], i)\n",
    "\n",
    "# 耗时分析\n",
    "end_time = time()                      # 标准化结束时间\n",
    "total_time = end_time-begin_time       # 标准化耗时\n",
    "print('标准化耗时：',total_time,'s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方案五"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 利用Sklearn库的StandardScaler实现数据标准化\n",
    "begin_time = time()                               # 标准化开始时间\n",
    "x_data = StandardScaler().fit_transform(x_data)   # 标准化，返回值为标准化后的数据\n",
    "\n",
    "# 耗时分析\n",
    "end_time = time()                                 # 标准化结束时间\n",
    "total_time = end_time-begin_time                  # 标准化耗时\n",
    "print('标准化耗时：',total_time,'s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将标准化后的数据集写入文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据标准化done！\n"
     ]
    }
   ],
   "source": [
    "data_normalizing = open(\"kddcup.data.numerization_corrected_normalizing_StandardScaler.txt\",'w', newline='')\n",
    "csv_writer = csv.writer(data_normalizing)\n",
    "i = 0\n",
    "while i<len(x_data[:, 0]):\n",
    "    csv_writer.writerow(x_data[i, :])\n",
    "    i = i + 1\n",
    "data_normalizing.close()\n",
    "print('数据标准化done！')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
